#!/usr/bin/python
# author Kelvin Pan <ibmmc@live.com>
# 04/29/2012 17:13
from operator import itemgetter 

# Path of the input file that main() opens and reads line by line.
DATA_FILE='/tmp/www.csdn.net.sql'
# Alternative input path; not referenced anywhere else in the visible code.
TEST_DATA_FILE='./test.txt'

def cleaned_line(eachline):
    '''Return the lower-cased, ASCII-only email domain found on a line.

    The last whitespace-separated field of the line is treated as the email
    address and everything after its final '@' as the domain (a field with
    no '@' is used as-is).  The '___csdn_1' marker is removed, the result is
    lower-cased, and every character whose code point is 128 or above is
    dropped.

    A blank or whitespace-only line yields an empty string instead of
    raising IndexError.
    '''
    fields = eachline.split()
    if not fields:
        return ''
    domain = fields[-1].split('@')[-1].replace('___csdn_1', '').lower()
    # Build a real string: filter() over a string returns a str only on
    # Python 2; on Python 3 it returns a lazy iterator, not the domain text.
    return ''.join(ch for ch in domain if ord(ch) < 128)


def main():
    '''Print how often each email domain occurs in DATA_FILE.

    Every non-blank line of DATA_FILE is reduced to its email domain by
    cleaned_line() and tallied.  The tallies are written to standard
    output, most frequent domain first, one "domain count" pair per line.
    '''
    data_store = {}
    # The with-block guarantees the file is closed, even if a line fails.
    with open(DATA_FILE) as txt:
        for eachline in txt:
            if not eachline.strip():
                # A blank line carries no address to count.
                continue
            # Key the tally on the cleaned domain itself; dict.get supplies
            # the initial 0 so no exception handling is needed for new keys.
            email_domain_name = cleaned_line(eachline)
            data_store[email_domain_name] = data_store.get(email_domain_name, 0) + 1

    data_store_after_sort = sorted(data_store.items(), key=itemgetter(1), reverse=True)
    for domain, count in data_store_after_sort:
        # Single parenthesised argument: valid on both Python 2 and 3.
        print('%-40s %d' % (domain, count))

# Run the report only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
